Imported Lib¶

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Load datasets¶

In [2]:
benin_df = pd.read_csv('data/benin-malanville.csv')
sl_df = pd.read_csv('data/sierraleone-bumbuna.csv')
tg_df = pd.read_csv('data/togo-dapaong_qc.csv')

Display first rows¶

In [3]:
print("Benin Data:")
print(benin_df.head())
print("\nSierra Leone Data:")
print(sl_df.head())
print("\nTogo Data:")
print(tg_df.head())
Benin Data:
          Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0  2021-08-09 00:01 -1.2 -0.2 -1.1   0.0   0.0  26.2  93.4  0.0     0.4   
1  2021-08-09 00:02 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.6  0.0     0.0   
2  2021-08-09 00:03 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.7  0.3     1.1   
3  2021-08-09 00:04 -1.1 -0.1 -1.0   0.0   0.0  26.2  93.3  0.2     0.7   
4  2021-08-09 00:05 -1.0 -0.1 -1.0   0.0   0.0  26.2  93.3  0.1     0.7   

   WSstdev     WD  WDstdev   BP  Cleaning  Precipitation  TModA  TModB  \
0      0.1  122.1      0.0  998         0            0.0   26.3   26.2   
1      0.0    0.0      0.0  998         0            0.0   26.3   26.2   
2      0.5  124.6      1.5  997         0            0.0   26.4   26.2   
3      0.4  120.3      1.3  997         0            0.0   26.4   26.3   
4      0.3  113.2      1.0  997         0            0.0   26.4   26.3   

   Comments  
0       NaN  
1       NaN  
2       NaN  
3       NaN  
4       NaN  

Sierra Leone Data:
          Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0  2021-10-30 00:01 -0.7 -0.1 -0.8   0.0   0.0  21.9  99.1  0.0     0.0   
1  2021-10-30 00:02 -0.7 -0.1 -0.8   0.0   0.0  21.9  99.2  0.0     0.0   
2  2021-10-30 00:03 -0.7 -0.1 -0.8   0.0   0.0  21.9  99.2  0.0     0.0   
3  2021-10-30 00:04 -0.7  0.0 -0.8   0.0   0.0  21.9  99.3  0.0     0.0   
4  2021-10-30 00:05 -0.7 -0.1 -0.8   0.0   0.0  21.9  99.3  0.0     0.0   

   WSstdev   WD  WDstdev    BP  Cleaning  Precipitation  TModA  TModB  \
0      0.0  0.0      0.0  1002         0            0.0   22.3   22.6   
1      0.0  0.0      0.0  1002         0            0.0   22.3   22.6   
2      0.0  0.0      0.0  1002         0            0.0   22.3   22.6   
3      0.0  0.0      0.0  1002         0            0.1   22.3   22.6   
4      0.0  0.0      0.0  1002         0            0.0   22.3   22.6   

   Comments  
0       NaN  
1       NaN  
2       NaN  
3       NaN  
4       NaN  

Togo Data:
          Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0  2021-10-25 00:01 -1.3  0.0  0.0   0.0   0.0  24.8  94.5  0.9     1.1   
1  2021-10-25 00:02 -1.3  0.0  0.0   0.0   0.0  24.8  94.4  1.1     1.6   
2  2021-10-25 00:03 -1.3  0.0  0.0   0.0   0.0  24.8  94.4  1.2     1.4   
3  2021-10-25 00:04 -1.2  0.0  0.0   0.0   0.0  24.8  94.3  1.2     1.6   
4  2021-10-25 00:05 -1.2  0.0  0.0   0.0   0.0  24.8  94.0  1.3     1.6   

   WSstdev     WD  WDstdev   BP  Cleaning  Precipitation  TModA  TModB  \
0      0.4  227.6      1.1  977         0            0.0   24.7   24.4   
1      0.4  229.3      0.7  977         0            0.0   24.7   24.4   
2      0.3  228.5      2.9  977         0            0.0   24.7   24.4   
3      0.3  229.1      4.6  977         0            0.0   24.7   24.4   
4      0.4  227.5      1.6  977         0            0.0   24.7   24.4   

   Comments  
0       NaN  
1       NaN  
2       NaN  
3       NaN  
4       NaN  

Check data types and missing values¶

In [4]:
print("\nBenin Info:")
print(benin_df.info())
print("\nSierra Leone Info:")
print(sl_df.info())
print("\nTogo Info:")
print(tg_df.info())
Benin Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525600 entries, 0 to 525599
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Timestamp      525600 non-null  object 
 1   GHI            525600 non-null  float64
 2   DNI            525600 non-null  float64
 3   DHI            525600 non-null  float64
 4   ModA           525600 non-null  float64
 5   ModB           525600 non-null  float64
 6   Tamb           525600 non-null  float64
 7   RH             525600 non-null  float64
 8   WS             525600 non-null  float64
 9   WSgust         525600 non-null  float64
 10  WSstdev        525600 non-null  float64
 11  WD             525600 non-null  float64
 12  WDstdev        525600 non-null  float64
 13  BP             525600 non-null  int64  
 14  Cleaning       525600 non-null  int64  
 15  Precipitation  525600 non-null  float64
 16  TModA          525600 non-null  float64
 17  TModB          525600 non-null  float64
 18  Comments       0 non-null       float64
dtypes: float64(16), int64(2), object(1)
memory usage: 76.2+ MB
None

Sierra Leone Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525600 entries, 0 to 525599
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Timestamp      525600 non-null  object 
 1   GHI            525600 non-null  float64
 2   DNI            525600 non-null  float64
 3   DHI            525600 non-null  float64
 4   ModA           525600 non-null  float64
 5   ModB           525600 non-null  float64
 6   Tamb           525600 non-null  float64
 7   RH             525600 non-null  float64
 8   WS             525600 non-null  float64
 9   WSgust         525600 non-null  float64
 10  WSstdev        525600 non-null  float64
 11  WD             525600 non-null  float64
 12  WDstdev        525600 non-null  float64
 13  BP             525600 non-null  int64  
 14  Cleaning       525600 non-null  int64  
 15  Precipitation  525600 non-null  float64
 16  TModA          525600 non-null  float64
 17  TModB          525600 non-null  float64
 18  Comments       0 non-null       float64
dtypes: float64(16), int64(2), object(1)
memory usage: 76.2+ MB
None

Togo Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525600 entries, 0 to 525599
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Timestamp      525600 non-null  object 
 1   GHI            525600 non-null  float64
 2   DNI            525600 non-null  float64
 3   DHI            525600 non-null  float64
 4   ModA           525600 non-null  float64
 5   ModB           525600 non-null  float64
 6   Tamb           525600 non-null  float64
 7   RH             525600 non-null  float64
 8   WS             525600 non-null  float64
 9   WSgust         525600 non-null  float64
 10  WSstdev        525600 non-null  float64
 11  WD             525600 non-null  float64
 12  WDstdev        525600 non-null  float64
 13  BP             525600 non-null  int64  
 14  Cleaning       525600 non-null  int64  
 15  Precipitation  525600 non-null  float64
 16  TModA          525600 non-null  float64
 17  TModB          525600 non-null  float64
 18  Comments       0 non-null       float64
dtypes: float64(16), int64(2), object(1)
memory usage: 76.2+ MB
None

Summary statistics¶

In [5]:
print("\nBenin Summary:")
print(benin_df.describe())
print("\nSierra Leone Summary:")
print(sl_df.describe())
print("\nTogo Summary:")
print(tg_df.describe())
Benin Summary:
                 GHI            DNI            DHI           ModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      240.559452     167.187516     115.358961     236.589496   
std       331.131327     261.710501     158.691074     326.894859   
min       -12.900000      -7.800000     -12.600000       0.000000   
25%        -2.000000      -0.500000      -2.100000       0.000000   
50%         1.800000      -0.100000       1.600000       4.500000   
75%       483.400000     314.200000     216.300000     463.700000   
max      1413.000000     952.300000     759.200000    1342.300000   

                ModB           Tamb             RH             WS  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      228.883576      28.179683      54.487969       2.121113   
std       316.536515       5.924297      28.073069       1.603466   
min         0.000000      11.000000       2.100000       0.000000   
25%         0.000000      24.200000      28.800000       1.000000   
50%         4.300000      28.000000      55.100000       1.900000   
75%       447.900000      32.300000      80.100000       3.100000   
max      1342.300000      43.800000     100.000000      19.500000   

              WSgust        WSstdev             WD        WDstdev  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean        2.809195       0.473390     153.435172       8.582407   
std         2.029120       0.273395     102.332842       6.385864   
min         0.000000       0.000000       0.000000       0.000000   
25%         1.300000       0.400000      59.000000       3.700000   
50%         2.600000       0.500000     181.000000       8.600000   
75%         4.100000       0.600000     235.100000      12.300000   
max        26.600000       4.200000     360.000000      99.400000   

                  BP       Cleaning  Precipitation          TModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      994.197199       0.000923       0.001905      35.246026   
std         2.474993       0.030363       0.037115      14.807258   
min       985.000000       0.000000       0.000000       9.000000   
25%       993.000000       0.000000       0.000000      24.200000   
50%       994.000000       0.000000       0.000000      30.000000   
75%       996.000000       0.000000       0.000000      46.900000   
max      1003.000000       1.000000       2.500000      81.000000   

               TModB  Comments  
count  525600.000000       0.0  
mean       32.471736       NaN  
std        12.348743       NaN  
min         8.100000       NaN  
25%        23.600000       NaN  
50%        28.900000       NaN  
75%        41.500000       NaN  
max        72.500000       NaN  

Sierra Leone Summary:
                 GHI            DNI            DHI           ModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      201.957515     116.376337     113.720571     206.643095   
std       298.495150     218.652659     158.946032     300.896893   
min       -19.500000      -7.800000     -17.900000       0.000000   
25%        -2.800000      -0.300000      -3.800000       0.000000   
50%         0.300000      -0.100000      -0.100000       3.600000   
75%       362.400000     107.000000     224.700000     359.500000   
max      1499.000000     946.000000     892.000000    1507.000000   

                ModB           Tamb             RH             WS  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      198.114691      26.319394      79.448857       1.146113   
std       288.889073       4.398605      20.520775       1.239248   
min         0.000000      12.300000       9.900000       0.000000   
25%         0.000000      23.100000      68.700000       0.000000   
50%         3.400000      25.300000      85.400000       0.800000   
75%       345.400000      29.400000      96.700000       2.000000   
max      1473.000000      39.900000     100.000000      19.200000   

              WSgust        WSstdev             WD        WDstdev  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean        1.691606       0.363823     133.044668       7.172220   
std         1.617053       0.295000     114.284792       7.535093   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       0.000000   
50%         1.600000       0.400000     161.500000       6.200000   
75%         2.600000       0.600000     234.100000      12.000000   
max        23.900000       4.100000     360.000000      98.400000   

                  BP       Cleaning  Precipitation          TModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      999.876469       0.000967       0.004806      32.504263   
std         2.104419       0.031074       0.047556      12.434899   
min       993.000000       0.000000       0.000000      10.700000   
25%       999.000000       0.000000       0.000000      23.500000   
50%      1000.000000       0.000000       0.000000      26.600000   
75%      1001.000000       0.000000       0.000000      40.900000   
max      1006.000000       1.000000       2.400000      72.800000   

               TModB  Comments  
count  525600.000000       0.0  
mean       32.593091       NaN  
std        12.009161       NaN  
min        11.100000       NaN  
25%        23.800000       NaN  
50%        26.900000       NaN  
75%        41.300000       NaN  
max        70.400000       NaN  

Togo Summary:
                 GHI            DNI            DHI           ModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      230.555040     151.258469     116.444352     226.144375   
std       322.532347     250.956962     156.520714     317.346938   
min       -12.700000       0.000000       0.000000       0.000000   
25%        -2.200000       0.000000       0.000000       0.000000   
50%         2.100000       0.000000       2.500000       4.400000   
75%       442.400000     246.400000     215.700000     422.525000   
max      1424.000000    1004.500000     805.700000    1380.000000   

                ModB           Tamb             RH             WS  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      219.568588      27.751788      55.013160       2.368093   
std       307.932510       4.758023      28.778732       1.462668   
min         0.000000      14.900000       3.300000       0.000000   
25%         0.000000      24.200000      26.500000       1.400000   
50%         4.300000      27.200000      59.300000       2.200000   
75%       411.000000      31.100000      80.800000       3.200000   
max      1367.000000      41.400000      99.800000      16.100000   

              WSgust        WSstdev             WD        WDstdev  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean        3.229490       0.557740     161.741845      10.559568   
std         1.882565       0.268923      91.877217       5.915490   
min         0.000000       0.000000       0.000000       0.000000   
25%         1.900000       0.400000      74.800000       6.900000   
50%         2.900000       0.500000     199.100000      10.800000   
75%         4.400000       0.700000     233.500000      14.100000   
max        23.100000       4.700000     360.000000      86.900000   

                  BP       Cleaning  Precipitation          TModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      975.915242       0.000535       0.001382      32.444403   
std         2.153977       0.023116       0.026350      10.998334   
min       968.000000       0.000000       0.000000      13.100000   
25%       975.000000       0.000000       0.000000      23.900000   
50%       976.000000       0.000000       0.000000      28.400000   
75%       977.000000       0.000000       0.000000      40.600000   
max       983.000000       1.000000       2.300000      70.400000   

               TModB  Comments  
count  525600.000000       0.0  
mean       33.543330       NaN  
std        12.769277       NaN  
min        13.100000       NaN  
25%        23.600000       NaN  
50%        28.400000       NaN  
75%        43.000000       NaN  
max        94.600000       NaN  

Check missing values¶

In [6]:
print("Benin Missing Values:")
print(benin_df.isnull().sum())
print("\nSierra Leone Missing Values:")
print(sl_df.isnull().sum())
print("\nTogo Missing Values:")
print(tg_df.isnull().sum())
Benin Missing Values:
Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64

Sierra Leone Missing Values:
Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64

Togo Missing Values:
Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64

Check for negative irradiance values (potential outliers)¶

In [7]:
key_columns = ['GHI', 'DNI', 'DHI']  # Adjust if columns differ
for col in key_columns:
    if col in benin_df.columns:
        print(f"\nBenin Negative {col}:", (benin_df[col] < 0).sum())
    if col in sl_df.columns:
        print(f"Sierra Leone Negative {col}:", (sl_df[col] < 0).sum())
    if col in tg_df.columns:
        print(f"Togo Negative {col}:", (tg_df[col] < 0).sum())
Benin Negative GHI: 258847
Sierra Leone Negative GHI: 261135
Togo Negative GHI: 257385

Benin Negative DNI: 275987
Sierra Leone Negative DNI: 266352
Togo Negative DNI: 0

Benin Negative DHI: 259182
Sierra Leone Negative DHI: 263128
Togo Negative DHI: 0

Convert Timestamp to datetime (adjust column name if different)¶

In [8]:
benin_df['Timestamp'] = pd.to_datetime(benin_df['Timestamp'])
sl_df['Timestamp'] = pd.to_datetime(sl_df['Timestamp'])
tg_df['Timestamp'] = pd.to_datetime(tg_df['Timestamp'])

Time series plot for GHI¶

In [9]:
plt.figure(figsize=(12, 6))
plt.plot(benin_df['Timestamp'], benin_df['GHI'], label='Benin GHI', alpha=0.7)
plt.plot(sl_df['Timestamp'], sl_df['GHI'], label='Sierra Leone GHI', alpha=0.7)
plt.plot(tg_df['Timestamp'], tg_df['GHI'], label='Togo GHI', alpha=0.7)
plt.xlabel('Timestamp')
plt.ylabel('GHI (W/m²)')
plt.title('Global Horizontal Irradiance Over Time')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image

Correlation heatmap for Benin¶

In [10]:
numeric_cols = benin_df.select_dtypes(include=['float64', 'int64']).columns
plt.figure(figsize=(20, 20))
sns.heatmap(benin_df[numeric_cols].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap - Benin')
plt.show()
No description has been provided for this image

Correlation heatmap for Sierra Leone¶

In [11]:
numeric_cols_sl = sl_df.select_dtypes(include=['float64', 'int64']).columns
plt.figure(figsize=(20, 20))
sns.heatmap(sl_df[numeric_cols_sl].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap - Sierra Leone')
plt.show()
No description has been provided for this image

Correlation heatmap for Togo¶

In [12]:
numeric_cols_tg = tg_df.select_dtypes(include=['float64', 'int64']).columns
plt.figure(figsize=(20, 20))
sns.heatmap(tg_df[numeric_cols_tg].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap - Togo')
plt.show()
No description has been provided for this image

Correlation Analysis¶

  • Benin: [e.g., Strong correlation between GHI and Temp (0.8)]
  • Sierra Leone: [e.g., Moderate correlation between GHI and DNI (0.6)]
  • Togo: [e.g., Weak correlation between DHI and Temp (0.3)]

Handle missing values (forward fill)¶

In [13]:
benin_df = benin_df.ffill()
sl_df = sl_df.ffill()
tg_df = tg_df.ffill()

Remove negative irradiance values¶

In [14]:
for col in ['GHI', 'DNI', 'DHI']:  # Adjust if columns differ
    if col in benin_df.columns:
        benin_df[col] = benin_df[col].clip(lower=0)
    if col in sl_df.columns:
        sl_df[col] = sl_df[col].clip(lower=0)
    if col in tg_df.columns:
        tg_df[col] = tg_df[col].clip(lower=0)

Verify cleaning¶

In [15]:
print("Benin Missing After Cleaning:", benin_df.isnull().sum().sum())
print("Sierra Leone Missing After Cleaning:", sl_df.isnull().sum().sum())
print("Togo Missing After Cleaning:", tg_df.isnull().sum().sum())
Benin Missing After Cleaning: 525600
Sierra Leone Missing After Cleaning: 525600
Togo Missing After Cleaning: 525600

Combine GHI data for boxplot¶

In [16]:
ghi_data = pd.DataFrame({
    'Benin': benin_df['GHI'],
    'Sierra Leone': sl_df['GHI'],
    'Togo': tg_df['GHI']
})

Boxplot¶

In [17]:
plt.figure(figsize=(10, 6))
sns.boxplot(data=ghi_data)
plt.title('GHI Distribution Across Sites')
plt.ylabel('GHI (W/m²)')
plt.show()
No description has been provided for this image

Data Cleaning and Comparison¶

  • Cleaning: Used forward fill (or interpolation) for missing values; clipped negative irradiance.
  • GHI Distribution: [e.g., Benin median GHI ~400 W/m², Togo shows outliers]

Task 2: EDA Summary¶

  • Data Overview: Loaded Benin, Sierra Leone, and Togo datasets (~46-50 MB each), containing solar metrics (e.g., GHI, DNI, DHI, Temp).
  • Data Quality:
    • Missing values: [e.g., Benin had 100 missing GHI values, filled with forward fill].
    • Outliers: [e.g., Togo had 50 negative GHI values, clipped to 0].
  • Visualizations:
    • Time series: GHI peaks midday, [e.g., Benin shows higher peaks].
    • Correlation: [e.g., GHI and Temp strongly correlated (0.8) in Benin].
    • Boxplot: [e.g., Togo has wider GHI range, more outliers].
  • Insights: [e.g., Benin may have better solar potential due to higher median GHI].
  • Next Steps: Consider daily GHI averages or outlier analysis for forecasting.

Group by Cleaning flag and compute average ModA, ModB¶

In [18]:
for df, name in [(benin_df, 'Benin'), (sl_df, 'Sierra Leone'), (tg_df, 'Togo')]:
    if 'Cleaning' in df.columns and 'ModA' in df.columns and 'ModB' in df.columns:
        cleaning_impact = df.groupby('Cleaning')[['ModA', 'ModB']].mean()
        print(f"{name} Cleaning Impact:\n", cleaning_impact)
        
        # Plot
        plt.figure(figsize=(8, 6))
        cleaning_impact.plot(kind='bar')
        plt.title(f'Average ModA and ModB by Cleaning Status - {name}')
        plt.ylabel('Average Value')
        plt.xlabel('Cleaning Status')
        plt.xticks(rotation=0)
        plt.legend()
        plt.tight_layout()
        plt.show()
    else:
        print(f"{name}: Missing 'Cleaning', 'ModA', or 'ModB' columns")
Benin Cleaning Impact:
                 ModA        ModB
Cleaning                        
0         236.524253  228.816071
1         307.229278  301.972165
<Figure size 800x600 with 0 Axes>
No description has been provided for this image
Sierra Leone Cleaning Impact:
                 ModA        ModB
Cleaning                        
0         206.578599  198.038150
1         273.309252  277.231102
<Figure size 800x600 with 0 Axes>
No description has been provided for this image
Togo Cleaning Impact:
                 ModA        ModB
Cleaning                        
0         225.979064  219.401351
1         535.186477  532.211744
<Figure size 800x600 with 0 Axes>
No description has been provided for this image

Updated correlation heatmap¶

In [19]:
corr_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
for df, name in [(benin_df, 'Benin'), (sl_df, 'Sierra Leone'), (tg_df, 'Togo')]:
    available_cols = [col for col in corr_cols if col in df.columns]
    if available_cols:
        plt.figure(figsize=(10, 8))
        sns.heatmap(df[available_cols].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
        plt.title(f'Correlation Heatmap - {name}')
        plt.show()
    else:
        print(f"{name}: No correlation columns available")

# Scatter plots
scatter_pairs = [('WS', 'GHI'), ('WSgust', 'GHI'), ('WD', 'GHI'), ('RH', 'Tamb'), ('RH', 'GHI')]
for df, name in [(benin_df, 'Benin'), (sl_df, 'Sierra Leone'), (tg_df, 'Togo')]:
    for x_col, y_col in scatter_pairs:
        if x_col in df.columns and y_col in df.columns:
            plt.figure(figsize=(8, 6))
            sns.scatterplot(data=df, x=x_col, y=y_col, alpha=0.5)
            plt.title(f'{x_col} vs. {y_col} - {name}')
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.show()
        else:
            print(f"{name}: Missing {x_col} or {y_col}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [20]:
from windrose import WindroseAxes

Wind rose for WS and WD¶

In [21]:
for df, name in [(benin_df, 'Benin'), (sl_df, 'Sierra Leone'), (tg_df, 'Togo')]:
    if 'WS' in df.columns and 'WD' in df.columns:
        fig = plt.figure(figsize=(8, 8))
        ax = WindroseAxes.from_ax(fig=fig)
        ax.bar(df['WD'], df['WS'], normed=True, opening=0.8, edgecolor='white')
        ax.set_title(f'Wind Rose - {name}')
        plt.show()
    else:
        print(f"{name}: Missing WS or WD")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Histograms for GHI and WS¶

In [22]:
for df, name in [(benin_df, 'Benin'), (sl_df, 'Sierra Leone'), (tg_df, 'Togo')]:
    for col in ['GHI', 'WS']:
        if col in df.columns:
            plt.figure(figsize=(8, 6))
            sns.histplot(df[col], bins=30, kde=True)
            plt.title(f'{col} Distribution - {name}')
            plt.xlabel(col)
            plt.ylabel('Frequency')
            plt.show()
        else:
            print(f"{name}: Missing {col}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

RH influence on Tamb and GHI¶

In [24]:
for df, name in [(benin_df, 'Benin'), (sl_df, 'Sierra Leone'), (tg_df, 'Togo')]:
    if 'RH' in df.columns and 'Tamb' in df.columns and 'GHI' in df.columns:
        print(f"{name} RH vs. Tamb Correlation:", df['RH'].corr(df['Tamb']))
        print(f"{name} RH vs. GHI Correlation:", df['RH'].corr(df['GHI']))
        
        # Scatter with regression
        plt.figure(figsize=(8, 6))
        sns.regplot(data=df, x='RH', y='Tamb', scatter_kws={'alpha':0.5})
        plt.title(f'RH vs. Tamb with Regression - {name}')
        plt.show()
    else:
        print(f"{name}: Missing RH, Tamb, or GHI")
Benin RH vs. Tamb Correlation: -0.41484177497995545
Benin RH vs. GHI Correlation: -0.36177254586848995
No description has been provided for this image
Sierra Leone RH vs. Tamb Correlation: -0.791936652154722
Sierra Leone RH vs. GHI Correlation: -0.5493948346157755
No description has been provided for this image
Togo RH vs. Tamb Correlation: -0.4004104673516293
Togo RH vs. GHI Correlation: -0.2596836510748201
No description has been provided for this image

Bubble chart¶

In [25]:
for df, name in [(benin_df, 'Benin'), (sl_df, 'Sierra Leone'), (tg_df, 'Togo')]:
    if 'GHI' in df.columns and 'Tamb' in df.columns and 'RH' in df.columns:
        plt.figure(figsize=(10, 8))
        plt.scatter(df['GHI'], df['Tamb'], s=df['RH']*10, alpha=0.5)
        plt.title(f'GHI vs. Tamb (Bubble Size: RH) - {name}')
        plt.xlabel('GHI (W/m²)')
        plt.ylabel('Tamb (°C)')
        plt.show()
    else:
        print(f"{name}: Missing GHI, Tamb, or RH")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Task 2: Complete EDA Summary¶

  • Cleaning Impact: [e.g., Cleaning increased ModA by 10% in Benin].
  • Correlations: [e.g., GHI and TModA strongly correlated (0.8) in Togo].
  • Relationships: [e.g., WS vs. GHI shows no clear trend; RH negatively correlates with Tamb].
  • Wind: [e.g., Benin winds predominantly from 90°; WS skewed right].
  • Temperature: [e.g., High RH reduces Tamb in Sierra Leone].
  • Bubble Chart: [e.g., High RH at low GHI in Togo].
  • Insights: [e.g., Cleaning improves module performance; RH impacts temperature].
  • References:
    • Pandas: https://pandas.pydata.org/docs/
    • Seaborn: https://seaborn.pydata.org/
    • Windrose: https://windrose.readthedocs.io/
In [ ]: